Question:

When our member traveling between different states (or country), which fast food/restaurant is most popular ?

Note:

  1. Member home location can be found in member profile. It is changing overtime. So, when matching transaction with member profile, we need match by both member Id and transaction date.

  2. Restaurant and fast food having MCCcode: 5812 and 5814. These code are one column of transaction data.

  3. Restaurant and fast food store location can be found in transcation data.

Solution:

Find all fast food/restaurant transaction that happens in a different state compare with the state of bill( statement ) in that month.

Summarize the fast food/restaurant by their names.

1. Load data from file:

trans1 <- read.csv('Data/April Vintage Transactions page 1.csv', stringsAsFactors=FALSE)
trans2 <- read.csv('Data/April Vintage Transactions page 2.csv', stringsAsFactors=FALSE)
trans2$DebtDimId <- as.character(trans2$DebtDimId)
trans3 <- read.csv('Data/April Vintage Transactions page 3.csv', stringsAsFactors=FALSE)
trans3$DebtDimId <- as.character(trans3$DebtDimId)
trans4 <- read.csv('Data/April Vintage Transactions page 4.csv', stringsAsFactors=FALSE)
trans4$DebtDimId <- as.character(trans4$DebtDimId)

trans.dat <- bind_rows( bind_rows(trans1,trans2),bind_rows(trans3,trans4) )
nrow(trans.dat)
## [1] 197271
head(trans.dat)
##   DebtDimId  TimeDimDt PremierTranCode PremierTranDescr
## 1  24577720 06/14/2013             101             Sale
## 2  24577720 06/14/2013             101             Sale
## 3  24579437 04/14/2013             101             Sale
## 4  24579437 04/14/2013             101             Sale
## 5  24579683 04/08/2013             101             Sale
## 6  24579683 04/08/2013             101             Sale
##                              FDRMerchDescr TransactionAmt MCCcode
## 1 TAG*TAGGED INC           866-217-8131 CA          10.00    8699
## 2 RALEY'S #236             OROVILLE     CA          35.71    5411
## 3 CS*SANTANDER CONSUM      866-724-2743 CA         199.95    7399
## 4 FAST LANE # 4744         JACKSON      MS          21.05    5542
## 5 BAR LOUIE GRAND RA       GRAND RAPIDS MI           8.42    5812
## 6 TGI FRIDAYS #1590        KENTWOOD     MI           4.00    5812
##                                          MCCCategory
## 1 Professional Services and Membership Organizations
## 2                                      Retail Stores
## 3                                  Business Services
## 4                               Automobiles/Vehicles
## 5                               Miscellaneous Stores
## 6                               Miscellaneous Stores
##                                       MCCDescription
## 1 Membership Organizations not elsewhere classified 
## 2                      Grocery Stores, Supermarkets 
## 3        Business Services not elsewhere classified 
## 4                          Automated Fuel Dispenser 
## 5                        Eating Places, Restaurants 
## 6                        Eating Places, Restaurants
summary(trans.dat)
##   DebtDimId          TimeDimDt         PremierTranCode PremierTranDescr  
##  Length:197271      Length:197271      Min.   :101.0   Length:197271     
##  Class :character   Class :character   1st Qu.:101.0   Class :character  
##  Mode  :character   Mode  :character   Median :101.0   Mode  :character  
##                                        Mean   :102.5                     
##                                        3rd Qu.:101.0                     
##                                        Max.   :305.0                     
##                                        NA's   :1                         
##  FDRMerchDescr      TransactionAmt        MCCcode     MCCCategory       
##  Length:197271      Min.   :-4132.08   Min.   :   0   Length:197271     
##  Class :character   1st Qu.:    7.00   1st Qu.:5411   Class :character  
##  Mode  :character   Median :   15.25   Median :5542   Mode  :character  
##                     Mean   :   27.82   Mean   :5740                     
##                     3rd Qu.:   32.52   3rd Qu.:5814                     
##                     Max.   : 4132.08   Max.   :9754                     
##                     NA's   :2          NA's   :2                        
##  MCCDescription    
##  Length:197271     
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
profile1 <- read.csv('Data/April Vintage 2013.csv', stringsAsFactors = FALSE)
profile2 <- read.csv('Data/April Vintage 2013 _ page 2.csv', stringsAsFactors = FALSE)
prof.dat <- bind_rows(profile1, profile2)
nrow(prof.dat)
## [1] 173253
head(prof.dat)
##   DebtDimId AccountDimId  FDROpenDt DatePrevStmt StatementDt MOB ExtStatus
## 1  24508718     65089656 04/02/2013         NULL  04/08/2013   0          
## 2  24508718     65089656 04/02/2013   04/08/2013  05/09/2013   1          
## 3  24508718     65089656 04/02/2013   05/09/2013  06/07/2013   2          
## 4  24508718     65089656 04/02/2013   06/07/2013  07/09/2013   3          
## 5  24508718     65089656 04/02/2013   07/09/2013  08/09/2013   4          
## 6  24508718     65089656 04/02/2013   08/09/2013  09/08/2013   5          
##   IntStatus DaysDeliq ActualMinPay OverlimitAmount DisplayMinPay
## 1                   0           30               0            30
## 2                   0           30               0            30
## 3                   0           30               0            30
## 4                   0           30               0            30
## 5                   0           30               0            30
## 6                   0           30               0            30
##   OpeningBalance BillLateCharge EndingBalance CreditLimit TotalNetPayments
## 1           0.00              0        180.00         300                0
## 2         180.00              0        154.50         300               30
## 3         154.50              0        159.74         300               30
## 4         159.74              0        267.92         300               30
## 5         267.92              0        245.81         300               30
## 6         245.81              0        223.06         300               30
##   TotalNetPurchaseAndCash TotalFeesBilled Concessions QuarterlyFicoScore
## 1                    0.00            0.00           0                  0
## 2                    0.00            4.50           0                536
## 3                   19.58           15.66           0                536
## 4                  130.00            8.18           0                536
## 5                    0.00            7.89           0                536
## 6                    0.00            7.25           0                515
##   BehavScore                         City ZState State ZipCode X24856314
## 1         11 NORTH LAS VEGAS                  NV    NV   89031        NA
## 2        636 NORTH LAS VEGAS                  NV    NV   89031        NA
## 3        639 NORTH LAS VEGAS                  NV    NV   89031        NA
## 4        644 NORTH LAS VEGAS                  NV    NV   89031        NA
## 5        648 NORTH LAS VEGAS                  NV    NV   89031        NA
## 6        657 NORTH LAS VEGAS                  NV    NV   89031        NA
##   X65742793 X04.18.2013 NULL. X04.21.2013 X0    X  X.1 X0.1 X30 X0.2 X30.1
## 1        NA        <NA>  <NA>        <NA> NA <NA> <NA>   NA  NA   NA    NA
## 2        NA        <NA>  <NA>        <NA> NA <NA> <NA>   NA  NA   NA    NA
## 3        NA        <NA>  <NA>        <NA> NA <NA> <NA>   NA  NA   NA    NA
## 4        NA        <NA>  <NA>        <NA> NA <NA> <NA>   NA  NA   NA    NA
## 5        NA        <NA>  <NA>        <NA> NA <NA> <NA>   NA  NA   NA    NA
## 6        NA        <NA>  <NA>        <NA> NA <NA> <NA>   NA  NA   NA    NA
##   X0.3 X0.4 X100 X400 X0.5 X0.6 X0.7 X0.8 X0.9 X11 OXNARD   CA CA.1 X93032
## 1   NA   NA   NA   NA   NA   NA   NA   NA   NA  NA   <NA> <NA> <NA>     NA
## 2   NA   NA   NA   NA   NA   NA   NA   NA   NA  NA   <NA> <NA> <NA>     NA
## 3   NA   NA   NA   NA   NA   NA   NA   NA   NA  NA   <NA> <NA> <NA>     NA
## 4   NA   NA   NA   NA   NA   NA   NA   NA   NA  NA   <NA> <NA> <NA>     NA
## 5   NA   NA   NA   NA   NA   NA   NA   NA   NA  NA   <NA> <NA> <NA>     NA
## 6   NA   NA   NA   NA   NA   NA   NA   NA   NA  NA   <NA> <NA> <NA>     NA
summary(prof.dat)
##    DebtDimId         AccountDimId       FDROpenDt        
##  Min.   :24508718   Min.   :65089656   Length:173253     
##  1st Qu.:24797902   1st Qu.:65639131   Class :character  
##  Median :24817093   Median :65675029   Mode  :character  
##  Mean   :24806966   Mean   :65711462                     
##  3rd Qu.:24836872   3rd Qu.:65709823                     
##  Max.   :24856301   Max.   :69639949                     
##  NA's   :80623      NA's   :80623                        
##  DatePrevStmt       StatementDt             MOB         ExtStatus        
##  Length:173253      Length:173253      Min.   : 0.00   Length:173253     
##  Class :character   Class :character   1st Qu.: 3.00   Class :character  
##  Mode  :character   Mode  :character   Median : 7.00   Mode  :character  
##                                        Mean   : 7.27                     
##                                        3rd Qu.:12.00                     
##                                        Max.   :17.00                     
##                                        NA's   :80623                     
##   IntStatus           DaysDeliq       ActualMinPay    OverlimitAmount  
##  Length:173253      Min.   :  0.00   Min.   :   0.0   Min.   :   0.00  
##  Class :character   1st Qu.:  0.00   1st Qu.:  30.0   1st Qu.:   0.00  
##  Mode  :character   Median :  0.00   Median :  30.0   Median :   0.00  
##                     Mean   : 25.54   Mean   :  63.2   Mean   :  28.39  
##                     3rd Qu.: 30.00   3rd Qu.:  60.0   3rd Qu.:  30.19  
##                     Max.   :390.00   Max.   :1237.0   Max.   :2199.63  
##                     NA's   :80623    NA's   :80623    NA's   :80623    
##  DisplayMinPay     OpeningBalance   BillLateCharge  EndingBalance   
##  Min.   :-401.01   Min.   :-500.6   Min.   : 0.00   Min.   :-500.6  
##  1st Qu.:  30.00   1st Qu.: 175.0   1st Qu.: 0.00   1st Qu.: 203.2  
##  Median :  33.71   Median : 304.6   Median : 0.00   Median : 310.6  
##  Mean   :  84.99   Mean   : 333.3   Mean   : 8.55   Mean   : 351.4  
##  3rd Qu.:  90.00   3rd Qu.: 441.9   3rd Qu.:25.00   3rd Qu.: 455.9  
##  Max.   :2374.63   Max.   :2499.6   Max.   :35.00   Max.   :2499.6  
##  NA's   :80623     NA's   :80623    NA's   :80623   NA's   :80623   
##   CreditLimit    TotalNetPayments  TotalNetPurchaseAndCash
##  Min.   :  0.0   Min.   :-817.77   Min.   :-2007.74       
##  1st Qu.:300.0   1st Qu.:   0.00   1st Qu.:    0.00       
##  Median :300.0   Median :  30.00   Median :    0.00       
##  Mean   :428.7   Mean   :  60.25   Mean   :   59.46       
##  3rd Qu.:700.0   3rd Qu.:  68.97   3rd Qu.:   63.88       
##  Max.   :900.0   Max.   :2770.07   Max.   : 2488.63       
##  NA's   :80623   NA's   :80623     NA's   :80623          
##  TotalFeesBilled    Concessions      QuarterlyFicoScore   BehavScore   
##  Min.   :-359.35   Min.   :-607.87   Min.   :  0.0      Min.   : 11.0  
##  1st Qu.:   5.90   1st Qu.:   0.00   1st Qu.:513.0      1st Qu.:556.0  
##  Median :  15.05   Median :   0.00   Median :576.0      Median :624.0  
##  Mean   :  22.31   Mean   :  -0.74   Mean   :521.2      Mean   :532.5  
##  3rd Qu.:  38.55   3rd Qu.:   0.00   3rd Qu.:626.0      3rd Qu.:651.0  
##  Max.   : 190.52   Max.   : 414.31   Max.   :798.0      Max.   :717.0  
##  NA's   :80623     NA's   :80623     NA's   :80623      NA's   :80623  
##      City              ZState             State              ZipCode     
##  Length:173253      Length:173253      Length:173253      Min.   :  605  
##  Class :character   Class :character   Class :character   1st Qu.:29102  
##  Mode  :character   Mode  :character   Mode  :character   Median :46224  
##                                                           Mean   :51290  
##                                                           3rd Qu.:77703  
##                                                           Max.   :99775  
##                                                           NA's   :80623  
##    X24856314          X65742793        X04.18.2013       
##  Min.   :24856314   Min.   :65512291   Length:173253     
##  1st Qu.:24865229   1st Qu.:65753728   Class :character  
##  Median :24876408   Median :65772563   Mode  :character  
##  Mean   :24876981   Mean   :65821277                     
##  3rd Qu.:24887693   3rd Qu.:65793608                     
##  Max.   :24901159   Max.   :69687240                     
##  NA's   :132942     NA's   :132942                       
##     NULL.           X04.21.2013              X0              X            
##  Length:173253      Length:173253      Min.   : 0.00    Length:173253     
##  Class :character   Class :character   1st Qu.: 3.00    Class :character  
##  Mode  :character   Mode  :character   Median : 7.00    Mode  :character  
##                                        Mean   : 7.82                      
##                                        3rd Qu.:12.00                      
##                                        Max.   :17.00                      
##                                        NA's   :132942                     
##      X.1                 X0.1             X30               X0.2        
##  Length:173253      Min.   :  0.00   Min.   :   0.00   Min.   :   0.00  
##  Class :character   1st Qu.:  0.00   1st Qu.:  30.00   1st Qu.:   0.00  
##  Mode  :character   Median :  0.00   Median :  38.00   Median :   0.00  
##                     Mean   : 20.16   Mean   :  63.86   Mean   :  25.18  
##                     3rd Qu.: 30.00   3rd Qu.:  60.00   3rd Qu.:  15.25  
##                     Max.   :390.00   Max.   :1219.00   Max.   :1430.43  
##                     NA's   :132942   NA's   :132942    NA's   :132942   
##      X30.1              X0.3             X0.4             X100       
##  Min.   :   0.00   Min.   :-603.5   Min.   : 0.00    Min.   :-603.5  
##  1st Qu.:  30.00   1st Qu.: 175.0   1st Qu.: 0.00    1st Qu.: 219.8  
##  Median :  39.00   Median : 358.7   Median : 0.00    Median : 378.7  
##  Mean   :  84.01   Mean   : 391.7   Mean   : 7.87    Mean   : 415.6  
##  3rd Qu.:  72.77   3rd Qu.: 645.1   3rd Qu.:25.00    3rd Qu.: 656.5  
##  Max.   :2130.43   Max.   :2130.4   Max.   :35.00    Max.   :2130.4  
##  NA's   :132942    NA's   :132942   NA's   :132942   NA's   :132942  
##       X400             X0.5              X0.6              X0.7        
##  Min.   :  0.0    Min.   :-885.60   Min.   :-500.00   Min.   :-277.17  
##  1st Qu.:300.0    1st Qu.:   0.00   1st Qu.:   0.00   1st Qu.:   6.84  
##  Median :500.0    Median :  40.00   Median :   0.00   Median :  19.22  
##  Mean   :520.3    Mean   :  66.19   Mean   :  66.05   Mean   :  24.20  
##  3rd Qu.:700.0    3rd Qu.:  75.00   3rd Qu.:  68.97   3rd Qu.:  38.88  
##  Max.   :900.0    Max.   :1994.14   Max.   :1683.99   Max.   : 208.95  
##  NA's   :132942   NA's   :132942    NA's   :132942    NA's   :132942   
##       X0.8              X0.9             X11            OXNARD         
##  Min.   :-379.71   Min.   :  0.0    Min.   : 11.0    Length:173253     
##  1st Qu.:   0.00   1st Qu.:538.0    1st Qu.:586.0    Class :character  
##  Median :   0.00   Median :598.0    Median :633.0    Mode  :character  
##  Mean   :  -0.73   Mean   :553.4    Mean   :552.4                      
##  3rd Qu.:   0.00   3rd Qu.:640.0    3rd Qu.:655.0                      
##  Max.   : 246.40   Max.   :791.0    Max.   :715.0                      
##  NA's   :132942    NA's   :132942   NA's   :132942                     
##       CA                CA.1               X93032      
##  Length:173253      Length:173253      Min.   :  683   
##  Class :character   Class :character   1st Qu.:27616   
##  Mode  :character   Mode  :character   Median :44504   
##                                        Mean   :49464   
##                                        3rd Qu.:77063   
##                                        Max.   :99802   
##                                        NA's   :132942
rm(trans1, trans2, trans3, trans4, profile1, profile2)

2. Table 1: Id, Date(start, end), Home state

remove record without DebtDimId data

temp <- prof.dat[-which(is.na(prof.dat$DebtDimId)),]

Replace NULL DatePrevStmt by FDROpenDt if it is the first record. Replace NULL DatePrevStmt by StatementDt - 30 if it is not the first record

idx <- which(( is.na(temp$DatePrevStmt) | temp$DatePrevStmt == 'NULL') & temp$MOB == 0)
temp[idx,]$DatePrevStmt <- temp[idx,]$FDROpenDt

idx <- which(is.na(temp$DatePrevStmt) | temp$DatePrevStmt == 'NULL' )
temp[idx,]$DatePrevStmt <- format( as.Date(temp[idx,]$StatementDt, "%m/%d/%Y") - 30, format="%m/%d/%Y")

prof.PrevStmt.fix <- temp
rm(temp)

Group user profile data by : Id, State

temp <- prof.PrevStmt.fix %>%
  group_by(DebtDimId, State) %>%
  summarise(StartDay = min(as.Date(DatePrevStmt, "%m/%d/%Y" )),
          EndDay = max(as.Date(StatementDt, "%m/%d/%Y"))) %>%
  select(one_of(c('DebtDimId','State', 'StartDay', 'EndDay')))

length(unique(prof.dat$DebtDimId))
## [1] 7039
nrow(temp)
## [1] 7271
tbl.Member.HomeState <- temp
rm(temp, prof.PrevStmt.fix)

3. Table 2: DebtDimId, Restaurant name, Locate State, TransDate

All transaction with MCCcode: 5812 or 5814

temp <- trans.dat[which(trans.dat$MCCcode==5812 | trans.dat$MCCcode==5814),]

Remove transactions in which FDRMerchDescr contains phone number

phone.reg.pattern = '\\s*(?:\\+?(\\d{1,3}))?[- (]*(\\d{3})[- )]*(\\d{3})[- ]*(\\d{4})(?: *[x/#]{1}(\\d+))?\\s*'
idx <- grep(phone.reg.pattern, temp$FDRMerchDescr)
if(length(idx) != 0)
{
  temp <- temp[-idx,]  
}

rm(phone.reg.pattern, idx)

Get State code from FDRMerchDescr

state.reg.pattern = '[[:alpha:]]{2}$'
store.loc <- str_extract(temp$FDRMerchDescr, state.reg.pattern)
idx.incorrect.loc <- which(! (store.loc %in% state.abb))
store.loc[idx.incorrect.loc] <- -1
rm(state.reg.pattern, idx.incorrect.loc)

Store Name 1: FDRMerchDescr - Remove city and state

store.name <- temp$FDRMerchDescr
tbDelete.reg.pattern = '[[:space:]]*[[:alpha:]]{2,} [[:alpha:]]{2}$'
store.name <- gsub(tbDelete.reg.pattern, '', store.name)

Store Name 2: Remove string after two space (or more)

tbDelete.reg.pattern = '[[:space:]]{2,}[[:print:]]*$'
store.name <- gsub(tbDelete.reg.pattern, '', store.name)

Store Name 3: Remove string after digit

tbDelete.reg.pattern = '[[:space:]]*[[:alpha:]]*\\d+[[:print:]]*$'
store.name <- gsub(tbDelete.reg.pattern, '', store.name)

Store Name 4: Remove string after #

tbDelete.reg.pattern = '[[:space:]]*#[[:print:]]*$'
store.name <- gsub(tbDelete.reg.pattern, '', store.name)

Store Name 5: Remove string after @

tbDelete.reg.pattern = '[[:space:]]*@[[:print:]]*$'
store.name <- gsub(tbDelete.reg.pattern, '', store.name)

Store Name 6: Remove string after -

tbDelete.reg.pattern = '[[:space:]]*-[[:print:]]*$'
store.name <- gsub(tbDelete.reg.pattern, '', store.name)

Store Name 7: Remove all non alpha beta charactor

This can solve the difference between ‘Wendy’s’ and ‘Wendy s’

tbDelete.reg.pattern = '[[:punct:][:space:]]*'
store.name <- gsub(tbDelete.reg.pattern, '', store.name)

Bind member id, transaction day, store name, store loc

member.id <- temp$DebtDimId
transaction.Day <- temp$TimeDimDt
tbl.clean.trans <- as.data.frame(cbind( cbind(member.id, transaction.Day), cbind(store.name, store.loc) ))

rm(member.id, transaction.Day, store.name, store.loc)

4. Fill Home state into transaction table

HomeState <- NULL
MissingList <- NULL
DuplicateList <- NULL
for(i in 1:nrow(tbl.clean.trans)){
  transId <- tbl.clean.trans$member.id[i]
  transDay <- as.Date(tbl.clean.trans$transaction.Day[i], "%m/%d/%Y")
  # Id.Idx <- which(tbl.Member.HomeState$DebtDimId == transId)
  # StartDay.Idx <- which(tbl.Member.HomeState$StartDay <= transDay)
  # EndDay.Idx <- which(tbl.Member.HomeState$EndDay >= transDay)
  #   
  # s <-Id.Idx[which(Id.Idx %in% StartDay.Idx &
  #                    Id.Idx %in% EndDay.Idx)]
  
  Hs.Idx <- which(  tbl.Member.HomeState$DebtDimId == transId &
                    tbl.Member.HomeState$StartDay <= transDay &
                    tbl.Member.HomeState$EndDay >= transDay)
  if(is.null(Hs.Idx) | length(Hs.Idx) == 0){
    MissingList <- c(MissingList, i)
    HomeState <- c(HomeState, 'MISS')
    next
  }
  
  if(length(Hs.Idx) > 1){
    DuplicateList <- c(DuplicateList, i)
    HomeState <- c(HomeState, 'DUPLICATE')
    next
  }
  
  HomeState <- c(HomeState, tbl.Member.HomeState$State[Hs.Idx])
}
length(MissingList)
## [1] 11859
length(DuplicateList)
## [1] 29
length(HomeState)
## [1] 40453

5. Find out the transation which happens in a state different with the home state

tmp.dat <- cbind(tbl.clean.trans, HomeState)

trans.idx <- which( tmp.dat$HomeState != 'MISS' &
                    tmp.dat$HomeState != 'DUPLICATE' &
                   # tmp.dat$store.loc != "NULL" &
                    as.character(tmp.dat$HomeState) != as.character(tmp.dat$store.loc))
rm(tmp.dat)

Answer:

MC Donalds is the most popular restaunt when people traveling out their state.